from urllib.request import ProxyHandler, build_opener

import requests
from fake_useragent import UserAgent
from random import randint
from time import sleep
from pyquery import PyQuery as pq

def get_html(url, timeout=10):
    """Fetch *url* with a random User-Agent and return the page text.

    The call sleeps for a random 5-10 seconds before issuing the request so
    that consecutive requests are spaced out.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: If the request fails at the network level
            (connection error, timeout, ...).
    """
    headers = {
        "User-Agent": UserAgent().random,
    }
    # Sleep a random 5-10 seconds before every request.
    sleep(randint(5, 10))

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    # Force UTF-8 decoding rather than relying on the guessed encoding.
    response.encoding = 'utf-8'
    return response.text


def parse_index(html):
    """Extract the absolute movie detail URLs from a listing page.

    Args:
        html: Markup of the listing page, or ``None``/empty when the page
            could not be fetched.

    Returns:
        A list of URLs built by prefixing ``http://maoyan.com`` to the
        ``href`` of every anchor found under elements carrying both the
        ``channel-detail`` and ``movie-item-title`` classes. The list is
        empty when *html* is empty or no such anchor exists.
    """
    # get_html() signals a failed download with None; there is nothing to
    # parse in that case.
    if not html:
        return []

    doc = pq(html)
    anchors = doc('.channel-detail.movie-item-title a')
    # Anchors without an href cannot yield a URL, so they are skipped instead
    # of raising KeyError.
    hrefs = (a.get('href') for a in anchors)
    return ['http://maoyan.com{}'.format(href) for href in hrefs if href]


def _first_text(matches):
    """Return the stripped ``.text`` of the first match, or ``None``."""
    if len(matches) == 0:
        return None
    text = matches[0].text
    return text.strip() if text is not None else None


def parse_info(html):
    """Extract name, type and actors from a movie detail page.

    Args:
        html: Markup of the detail page, or ``None``/empty when the page
            could not be fetched.

    Returns:
        A dict with three keys:

        * ``"name"``: stripped text of the first ``h1.name`` element, or
          ``None`` when it is missing.
        * ``"type"``: stripped text of the first ``li.ellipsis`` element, or
          ``None`` when it is missing.
        * ``"actor"``: set of actor names produced by ``format_actors`` from
          the anchors matching ``li.celebrity.actor > div.info > a``.
    """
    # A failed download (None) or an empty page has no fields to extract.
    if not html:
        return {"name": None, "type": None, "actor": set()}

    doc = pq(html)
    # Pages that lack the expected elements yield None for the affected
    # field instead of an IndexError.
    return {
        "name": _first_text(doc("h1.name")),
        "type": _first_text(doc('li.ellipsis')),
        "actor": format_actors(doc('li.celebrity.actor > div.info > a')),
    }


def format_actors(actors_a):
    """Collect the distinct, whitespace-stripped actor names.

    Args:
        actors_a: Iterable of elements exposing a ``text`` attribute that is
            either a string or ``None``.

    Returns:
        A set of the non-empty stripped ``text`` values. Elements whose text
        is ``None`` or blank are ignored.
    """
    # An element's text can be None (e.g. an anchor without leading text);
    # treat that the same as an empty name rather than failing on .strip().
    stripped = ((a.text or '').strip() for a in actors_a)
    return {name for name in stripped if name}


def main():
    """Crawl the film listing page and print the details of every movie.

    Downloads the listing page, extracts the movie detail URLs, downloads and
    parses each one, printing the parsed record. Finally prints the list of
    detail URLs. Pages that cannot be downloaded are reported and skipped.
    """
    # The unfiltered listing lives at https://maoyan.com/films; this URL adds
    # query-string filters to it.
    index_url = 'https://maoyan.com/films?catId=2&sourceId=2&showType=3'

    html = get_html(index_url)
    # get_html() returns None on a non-200 response; without the listing
    # there is nothing to crawl.
    if html is None:
        print('Failed to fetch index page: {}'.format(index_url))
        return

    movie_urls = parse_index(html)
    for url in movie_urls:
        movie_html = get_html(url)
        if movie_html is None:
            # One failed page should not abort the whole crawl.
            print('Failed to fetch movie page: {}'.format(url))
            continue
        print(parse_info(movie_html))

    print(movie_urls)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
